In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import plotly.express as px
In [2]:
try:
df=pd.read_excel("financial_loan.xlsx")
except FileNotFoundError as e:
print(e)
Understanding & Data Cleaning¶
In [3]:
df
Out[3]:
| id | address_state | application_type | emp_length | emp_title | grade | home_ownership | issue_date | last_credit_pull_date | last_payment_date | ... | sub_grade | term | verification_status | annual_income | dti | installment | int_rate | loan_amount | total_acc | total_payment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1077430 | GA | INDIVIDUAL | < 1 year | Ryder | C | RENT | 2021-02-11 | 2021-09-13 | 2021-04-13 | ... | C4 | 60 months | Source Verified | 30000.0 | 0.0100 | 59.83 | 0.1527 | 2500 | 4 | 1009 |
| 1 | 1072053 | CA | INDIVIDUAL | 9 years | MKC Accounting | E | RENT | 2021-01-01 | 2021-12-14 | 2021-01-15 | ... | E1 | 36 months | Source Verified | 48000.0 | 0.0535 | 109.43 | 0.1864 | 3000 | 4 | 3939 |
| 2 | 1069243 | CA | INDIVIDUAL | 4 years | Chemat Technology Inc | C | RENT | 2021-01-05 | 2021-12-12 | 2021-01-09 | ... | C5 | 36 months | Not Verified | 50000.0 | 0.2088 | 421.65 | 0.1596 | 12000 | 11 | 3522 |
| 3 | 1041756 | TX | INDIVIDUAL | < 1 year | barnes distribution | B | MORTGAGE | 2021-02-25 | 2021-12-12 | 2021-03-12 | ... | B2 | 60 months | Source Verified | 42000.0 | 0.0540 | 97.06 | 0.1065 | 4500 | 9 | 4911 |
| 4 | 1068350 | IL | INDIVIDUAL | 10+ years | J&J Steel Inc | A | MORTGAGE | 2021-01-01 | 2021-12-14 | 2021-01-15 | ... | A1 | 36 months | Verified | 83000.0 | 0.0231 | 106.53 | 0.0603 | 3500 | 28 | 3835 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 38571 | 803452 | NJ | INDIVIDUAL | < 1 year | Joseph M Sanzari Company | C | MORTGAGE | 2021-07-11 | 2021-05-16 | 2021-05-16 | ... | C1 | 60 months | Verified | 100000.0 | 0.1986 | 551.64 | 0.1299 | 24250 | 33 | 31946 |
| 38572 | 970377 | NY | INDIVIDUAL | 8 years | Swat Fame | C | RENT | 2021-10-11 | 2021-04-16 | 2021-05-16 | ... | C1 | 60 months | Verified | 50000.0 | 0.0458 | 579.72 | 0.1349 | 25200 | 18 | 31870 |
| 38573 | 875376 | CA | INDIVIDUAL | 5 years | Anaheim Regional Medical Center | D | RENT | 2021-09-11 | 2021-05-16 | 2021-05-16 | ... | D5 | 60 months | Verified | 65000.0 | 0.1734 | 627.93 | 0.1749 | 25000 | 20 | 35721 |
| 38574 | 972997 | NY | INDIVIDUAL | 5 years | Brooklyn Radiology | D | RENT | 2021-10-11 | 2021-05-16 | 2021-05-16 | ... | D5 | 60 months | Verified | 368000.0 | 0.0009 | 612.72 | 0.1825 | 24000 | 9 | 33677 |
| 38575 | 682952 | NY | INDIVIDUAL | 4 years | Allen Edmonds | F | RENT | 2021-07-11 | 2021-05-16 | 2021-05-16 | ... | F3 | 60 months | Verified | 80000.0 | 0.0600 | 486.86 | 0.2099 | 18000 | 7 | 27679 |
38576 rows × 24 columns
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 38576 entries, 0 to 38575 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 38576 non-null int64 1 address_state 38576 non-null object 2 application_type 38576 non-null object 3 emp_length 38576 non-null object 4 emp_title 37138 non-null object 5 grade 38576 non-null object 6 home_ownership 38576 non-null object 7 issue_date 38576 non-null datetime64[ns] 8 last_credit_pull_date 38576 non-null datetime64[ns] 9 last_payment_date 38576 non-null datetime64[ns] 10 loan_status 38576 non-null object 11 next_payment_date 38576 non-null datetime64[ns] 12 member_id 38576 non-null int64 13 purpose 38576 non-null object 14 sub_grade 38576 non-null object 15 term 38576 non-null object 16 verification_status 38576 non-null object 17 annual_income 38576 non-null float64 18 dti 38576 non-null float64 19 installment 38576 non-null float64 20 int_rate 38576 non-null float64 21 loan_amount 38576 non-null int64 22 total_acc 38576 non-null int64 23 total_payment 38576 non-null int64 dtypes: datetime64[ns](4), float64(4), int64(5), object(11) memory usage: 7.1+ MB
In [5]:
df[df.duplicated()]
Out[5]:
| id | address_state | application_type | emp_length | emp_title | grade | home_ownership | issue_date | last_credit_pull_date | last_payment_date | ... | sub_grade | term | verification_status | annual_income | dti | installment | int_rate | loan_amount | total_acc | total_payment |
|---|
0 rows × 24 columns
In [6]:
df.isnull().sum()
Out[6]:
id 0 address_state 0 application_type 0 emp_length 0 emp_title 1438 grade 0 home_ownership 0 issue_date 0 last_credit_pull_date 0 last_payment_date 0 loan_status 0 next_payment_date 0 member_id 0 purpose 0 sub_grade 0 term 0 verification_status 0 annual_income 0 dti 0 installment 0 int_rate 0 loan_amount 0 total_acc 0 total_payment 0 dtype: int64
In [7]:
catcols=df.select_dtypes(include=['object','category']).columns.tolist()
for i in catcols:
print(df[i].value_counts())
print('--------------------------------------------')
address_state
CA 6894
NY 3701
FL 2773
TX 2664
NJ 1822
IL 1486
PA 1482
VA 1375
GA 1355
MA 1310
OH 1188
MD 1027
AZ 833
WA 805
CO 770
NC 759
CT 730
MI 685
MO 660
MN 592
NV 482
SC 464
WI 446
OR 436
AL 432
LA 426
KY 320
OK 293
KS 260
UT 252
AR 236
DC 214
RI 196
NM 183
HI 170
WV 167
NH 161
DE 110
MT 79
WY 79
AK 78
SD 63
VT 54
MS 19
TN 17
IN 9
ID 6
NE 5
IA 5
ME 3
Name: count, dtype: int64
--------------------------------------------
application_type
INDIVIDUAL 38576
Name: count, dtype: int64
--------------------------------------------
emp_length
10+ years 8870
< 1 year 4575
2 years 4382
3 years 4088
4 years 3428
5 years 3273
1 year 3229
6 years 2228
7 years 1772
8 years 1476
9 years 1255
Name: count, dtype: int64
--------------------------------------------
emp_title
US Army 135
Bank of America 109
IBM 67
AT&T 63
Wells Fargo 57
...
CH Newton Builders, Inc 1
NGS Associates, Inc. 1
Ridgecrest Capital Partners 1
rice &werthmann 1
Five County Assoc of Governments 1
Name: count, Length: 28525, dtype: int64
--------------------------------------------
grade
B 11674
A 9689
C 7904
D 5182
E 2786
F 1028
G 313
Name: count, dtype: int64
--------------------------------------------
home_ownership
RENT 18439
MORTGAGE 17198
OWN 2838
OTHER 98
NONE 3
Name: count, dtype: int64
--------------------------------------------
loan_status
Fully Paid 32145
Charged Off 5333
Current 1098
Name: count, dtype: int64
--------------------------------------------
purpose
Debt consolidation 18214
credit card 4998
other 3824
home improvement 2876
major purchase 2110
small business 1776
car 1497
wedding 928
medical 667
moving 559
house 366
vacation 352
educational 315
renewable_energy 94
Name: count, dtype: int64
--------------------------------------------
sub_grade
B3 2834
A4 2803
A5 2654
B5 2644
B4 2455
C1 2089
B2 1990
C2 1972
B1 1751
A3 1740
C3 1490
A2 1440
D2 1314
C4 1202
C5 1151
D3 1144
A1 1052
D4 960
D1 913
D5 851
E1 750
E2 640
E3 538
E4 448
E5 410
F1 325
F2 243
F3 182
F4 163
F5 115
G1 101
G2 78
G4 56
G3 48
G5 30
Name: count, dtype: int64
--------------------------------------------
term
36 months 28237
60 months 10339
Name: count, dtype: int64
--------------------------------------------
verification_status
Not Verified 16464
Verified 12335
Source Verified 9777
Name: count, dtype: int64
--------------------------------------------
Exploratory Data Analysis (EDA)¶
Q: Total Loan Applications¶
In [8]:
print('Total Loan Applications:',df['id'].count())
Total Loan Applications: 38576
Q: MTD (Month-to-Date) Total Loan Applications¶
In [9]:
df['IDYM']=df['issue_date'].dt.strftime('%Y-%m')
lma=df[df['issue_date'].dt.to_period('M')==df['IDYM'].max()].shape[0]
print(f'Latest month Applications ({(df['IDYM'].max())}): {lma}')
Latest month Applications (2021-12): 4314
Q: MOM (Month-on-Month) Loan Applications¶
In [10]:
mom=df['IDYM'].value_counts().reset_index(name='TA').sort_values(by='IDYM')
plt.figure(figsize=(15,5))
plt.fill_between(mom['IDYM'],mom['TA'],color='skyblue',alpha=0.5)
plt.plot(mom['IDYM'],mom['TA'],marker='.',color='blue')
for x,y in zip(mom['IDYM'],mom['TA']):
plt.text(x,y+50,str(y),ha='center',va='bottom')
plt.title('No. of Loan Applications per Month')
plt.xlabel('Year-Months')
plt.ylabel('Loan Applications')
plt.ylim(2000,4500)
plt.tight_layout()
plt.grid(linestyle='--',alpha=0.6)
plt.show()
Q: Total Funded Amount¶
In [11]:
print(f'Total Funded Amount: ${(df['loan_amount'].sum()/1000000).round(2)} M')
Total Funded Amount: $435.76 M
Q: MOM Total Funded Amount¶
In [12]:
mfi=df.groupby('IDYM')['loan_amount'].sum().reset_index(name='TLA').sort_values(by='IDYM')
mfi
Out[12]:
| IDYM | TLA | |
|---|---|---|
| 0 | 2021-01 | 25031650 |
| 1 | 2021-02 | 24647825 |
| 2 | 2021-03 | 28875700 |
| 3 | 2021-04 | 29800800 |
| 4 | 2021-05 | 31738350 |
| 5 | 2021-06 | 34161475 |
| 6 | 2021-07 | 35813900 |
| 7 | 2021-08 | 38149600 |
| 8 | 2021-09 | 40907725 |
| 9 | 2021-10 | 44893800 |
| 10 | 2021-11 | 47754825 |
| 11 | 2021-12 | 53981425 |
In [13]:
plt.figure(figsize=(15,5))
mfi['TLAM']=round(mfi['TLA']/1000000,2)
plt.fill_between(mfi['IDYM'],mfi['TLAM'],color='tomato',alpha=0.3)
plt.plot(mfi['IDYM'],mfi['TLAM'],marker='.',color='orangered')
for x,y in zip(mfi['IDYM'],mfi['TLAM']):
plt.text(x,y+1,'$'+str(y)+'M',ha='center',va='bottom')
plt.title('Total Amount Funded per Month')
plt.xlabel('Year-Months')
plt.ylabel('Amount in Millions')
plt.ylim(20,60)
plt.grid(linestyle='--',alpha=0.6)
plt.show()
Q: Total Amount Received¶
In [14]:
print(f'Total Amount Received: ${round(df['total_payment'].sum()/1000000,2)}M')
Total Amount Received: $473.07M
Q: Average Interest Rate¶
In [15]:
print(f'Average Interest Rate: {(df['int_rate'].mean()*100).round(2)}%')
Average Interest Rate: 12.05%
In [16]:
#### Q: Average DTI (Debt-to-Income) Ratio:
print(f'Average DTI Ratio: {(df['dti'].mean()*100).round(2)}%')
Average DTI Ratio: 13.33%
Q: Good Loan Matrix¶
In [17]:
gl=df[df['loan_status'].isin(['Fully Paid','Current'])]
print(f'No. of GoodLoans: {gl['id'].count()}')
print(f'Funded Amt for GoodLoans: ${round(gl['loan_amount'].sum()/1000000,2)} M')
print(f'GoodLoans Amt Recieved: ${round(gl['total_payment'].sum()/1000000,2)} M')
print(f'GoodLoans Applications: {round(gl['id'].count()/df['id'].count()*100)}%')
No. of GoodLoans: 33243 Funded Amt for GoodLoans: $370.22 M GoodLoans Amt Recieved: $435.79 M GoodLoans Applications: 86%
Q: Bad Loan Matrix¶
In [18]:
bl=df[df['loan_status']=='Charged Off']
print(f'No. of BadLoans: {bl['id'].count()}')
print(f'Funded Amt for BadLoans: ${round(bl['loan_amount'].sum()/1000000,2)} M')
print(f'Loss Due to BadLoans: ${round(bl['total_payment'].sum()/1000000,2)} M')
print(f'Percentage of Loss: {round(bl['total_payment'].sum()/df['loan_amount'].sum()*100,2)}%')
print(f'BadLoans Applications: {round(bl['id'].count()/df['id'].count()*100)}%')
No. of BadLoans: 5333 Funded Amt for BadLoans: $65.53 M Loss Due to BadLoans: $37.28 M Percentage of Loss: 8.56% BadLoans Applications: 14%
Q: Regional Analysis by State for Total Loan Applications¶
In [19]:
rta=df.groupby('address_state')['id'].count().reset_index(name='TLA').sort_values(by='TLA',ascending=False)
plt.figure(figsize=(15,10))
bar=plt.barh(rta['address_state'],rta['TLA'],color='skyblue',height=0.6)
plt.gca().invert_yaxis()
plt.title('Total Loan Applications per State')
plt.ylabel('State',fontsize=12)
plt.xlabel('Number of Loan Applications',fontsize=12)
plt.bar_label(bar,
labels=[f"{v:,}" for v in rta['TLA']],
fontsize=10,
padding=5)
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
Q: Regional Analysis by State for Total Fund Allocation¶
In [20]:
rfa=df.groupby('address_state')['loan_amount'].sum().reset_index(name='TFA').sort_values(by='TFA',ascending=False)
plt.figure(figsize=(15,10))
bar=plt.barh(rfa['address_state'],rfa['TFA']//1000000,color='lightcoral',height=0.6)
plt.gca().invert_yaxis()
plt.title('Total Fund Allocations per State ($)')
plt.ylabel('State',fontsize=12)
plt.xlabel('Amount in Millions',fontsize=12)
plt.bar_label(bar,
labels=[f"{v}K" for v in (rfa['TFA']//1000)],
fontsize=10,
padding=5)
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
Q: Regional Analysis by State for Total Amount Received¶
In [21]:
rar=df.groupby('address_state')['total_payment'].sum().reset_index(name='TAR').sort_values(by='TAR',ascending=False)
plt.figure(figsize=(15,10))
bar=plt.barh(rar['address_state'],rar['TAR']//1000000,color='green',height=0.6,alpha=0.7)
plt.gca().invert_yaxis()
plt.title('Total Amount Received per State ($)')
plt.ylabel('State',fontsize=12)
plt.xlabel('Amount in Millions',fontsize=12)
plt.bar_label(bar,
labels=[f"{v}K" for v in (rar['TAR']//1000)],
fontsize=10,
padding=5)
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
Q: Total Loan Application by Term¶
In [22]:
lat=df.groupby('term')['id'].count()
plt.figure(figsize=(5,5))
plt.pie(lat,labels=lat.index,autopct=lambda p: f'{p:.0f}%\n({int(p*lat.sum()/100)})',colors=['darkcyan','mediumturquoise'],startangle=90)
plt.title('Number of Loan Applications by Tenure')
plt.show()
Q: Total Funded Amount by Term¶
In [23]:
fat=df.groupby('term')['loan_amount'].sum()
plt.figure(figsize=(5,5))
plt.pie(fat,labels=fat.index,autopct=lambda p: f'{p:.0f}%\n(${p*fat.sum()/100/1e6:.1f}M)',colors=['orangered','coral'],startangle=90)
plt.title('Total Fund Allocated by Tenure')
plt.show()
Q: Total Amount Received by Term¶
In [24]:
art=df.groupby('term')['total_payment'].sum()
plt.figure(figsize=(5,5))
plt.pie(art,labels=fat.index,autopct=lambda p: f'{p:.0f}%\n(${p*fat.sum()/100/1e6:.1f}M)',colors=['olivedrab','greenyellow'],startangle=90)
plt.title('Total Amount Received by Tenure')
plt.show()
In [25]:
df['cov']=np.where(df['loan_status']=='Charged Off',1,0)
clt=df.groupby('term')['cov'].sum()
plt.figure(figsize=(5,5))
plt.pie(clt,labels=clt.index,startangle=90,colors=['darkred','tomato'],autopct=lambda p: f'{p:.0f}%\n({int(p*clt.sum()/100)})')
plt.title('Number of Loans Defaulted by Tenure')
plt.show()
Q: Total Loan Applications by Employee Len¶
In [26]:
ela=df.groupby('emp_length')['id'].count().reset_index(name='TA').sort_values(by='TA',ascending=True)
plt.figure(figsize=(15,5))
bar=plt.barh(ela['emp_length'],ela['TA'],color=plt.cm.Blues(ela['TA']/ela['TA'].max()))
plt.title("Number of Loan Applications by Client's Work Experience")
plt.xlabel("Number of Applications")
plt.ylabel("Work Experience")
plt.bar_label(bar,padding=5)
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
Q: Loan Defaults as per Employee Len¶
In [27]:
cee=df.groupby('emp_length').agg(TLA=('id','count'),NLD=('cov','sum')).reset_index()
cee['Percent']=round(cee['NLD']/cee['TLA']*100)
cee = cee.sort_values('Percent', ascending=True)
plt.figure(figsize=(20,6))
bars = plt.barh(
cee['emp_length'],
cee['Percent'],
color=plt.cm.Reds(cee['Percent'] / cee['Percent'].max())
)
plt.title("Number of Loans Defaulted & Percentage by Client's Work Experience")
plt.xlabel("Default Rate (%)")
plt.ylabel("Work Experience")
plt.xlim(0,20)
for bar, nld, pct in zip(bars, cee['NLD'], cee['Percent']):
plt.text(
bar.get_width()+2.5,
bar.get_y() + bar.get_height() / 2,
f"Defaulted: {nld} | {pct:.0f}%",
va='center',
ha='right',
fontsize=10,
)
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.subplots_adjust(left=0.3, right=0.95)
plt.show()
Q: Total Loan Applications by purpose¶
In [28]:
tap=df.groupby('purpose').agg(TotalLoans=('id','count')).reset_index().sort_values(by='TotalLoans',ascending=True)
plt.figure(figsize=(15,5))
bar=plt.barh(tap['purpose'],tap['TotalLoans'],color=plt.cm.viridis(tap['TotalLoans']/tap['TotalLoans'].max()))
plt.title("Number of Loan Applications by Purpose")
plt.xlabel("Number of Loan Applications")
plt.ylabel("Purposes")
plt.bar_label(bar,padding=5)
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
Q: Total Amount Funded by purpose¶
In [29]:
taf=df.groupby('purpose').agg(FundedAmt=('loan_amount','sum')).reset_index().sort_values(by='FundedAmt',ascending=True)
plt.figure(figsize=(15,5))
bar=plt.barh(taf['purpose'],taf['FundedAmt']//1000,color=plt.cm.Wistia(taf['FundedAmt']/taf['FundedAmt'].max()))
plt.title("Total Amount Funded by Purpose ($)")
plt.xlabel(" Amount Funded")
plt.ylabel("Purposes")
plt.bar_label(bar,labels=[f'{v}k' for v in taf['FundedAmt']//1000],padding=5)
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
Q: Loans Defaults by Purpose¶
In [30]:
ldp=df.groupby('purpose').agg(TLA=('id','count'),NLD=('cov','sum')).reset_index()
ldp['Percent']=round(ldp['NLD']/ldp['TLA']*100)
ldp = ldp.sort_values('Percent', ascending=True)
plt.figure(figsize=(15,5))
colors=plt.cm.autumn(np.linspace(1,0,ldp.shape[0]))
bar=plt.barh(ldp['purpose'],ldp['Percent'],color=colors)
plt.title("Number of Loan Defaults & Percentage by Purpose of Loan Required")
plt.xlabel("Number of Loan Defaults")
plt.ylabel("Purposes")
plt.xlim(0,35)
for bar,x,y in zip(bar,ldp['NLD'],ldp['Percent']):
plt.text(
bar.get_width()+4.5,
bar.get_y()+bar.get_height()/2,
f'Defaulted: {x} | {y:.0f}%',
va='center',
ha='right',
fontsize=10,
)
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
Q: Loan Applications by Home Ownership¶
In [31]:
aho=df.groupby('home_ownership')['id'].count().reset_index(name='TA')
plt.figure(figsize=(15,8))
fig=px.treemap(
aho,
path=['home_ownership'],
color='TA',
values='TA',
color_continuous_scale='Blues',
title='Number of Loan Applications by Home Ownership'
)
fig.update_traces(
textinfo='label+value',
textfont_size=12
)
fig.show()
<Figure size 1500x800 with 0 Axes>
Q: Total Amount Funded by Home Ownership¶
In [32]:
aho=df.groupby('home_ownership')['loan_amount'].sum().reset_index(name='TAF')
plt.figure(figsize=(15,8))
fig=px.treemap(
aho,
path=['home_ownership'],
color='TAF',
values='TAF',
color_continuous_scale='Oranges',
title='Total Amount Funded by Home Ownership'
)
fig.update_traces(
textinfo='label+value',
textfont_size=12
)
fig.show()
<Figure size 1500x800 with 0 Axes>
Q: No. of Loan Defaults by Home Ownership¶
In [33]:
ldho=df.groupby('home_ownership').agg(TLA=('id','count'),NLD=('cov','sum')).reset_index()
ldho['Percent']=round(ldho['NLD']/ldho['TLA']*100)
ldho=ldho.query("Percent>0")
plt.figure(figsize=(15,8))
fig=px.treemap(
ldho,
path=['home_ownership'],
color='Percent',
values='Percent',
custom_data=['TLA', 'NLD', 'Percent'],
color_continuous_scale='Reds',
title='Loans Defaulted by Home Ownership'
)
fig.update_traces(
texttemplate=
"%{label}<br><br>"
"Total Applications: %{customdata[0]}<br>"
"Loans Defaulted: %{customdata[1]}<br>"
"Percentage: %{customdata[2]}%<br>",
textfont_size=12
)
fig.show()
<Figure size 1500x800 with 0 Axes>
Q: No. of Loans Default due to sources¶
In [34]:
lds=df.groupby('verification_status')['cov'].sum().reset_index(name='NLD').sort_values(by='NLD',ascending=True)
plt.figure(figsize=(12,3))
bar=plt.barh(lds['verification_status'],lds['NLD'],height=0.4,color='purple')
plt.title("Number of Loans Defaulted by Sources")
plt.xlabel("Number of Loans Defaulted")
plt.ylabel("Verification Status")
plt.bar_label(bar,padding=5)
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
Q: Overview of Loan Applications¶
In [35]:
lts = df.groupby('loan_status')['id'].count()
plt.figure(figsize=(5,5))
colors = ['#2E7D32', '#C62828', '#F9A825']
explode = [0.08,0,0]
plt.pie(
lts,
labels=lts.index,
autopct=lambda p: f'{p:.0f}%\n({int(p*lts.sum()/100):,})',
colors=colors,
explode=explode,
startangle=90,
counterclock=False,
pctdistance=0.80,
labeldistance=1.05,
wedgeprops=dict(width=0.4, edgecolor='white')
)
plt.text(0, 0, "LOANS\nSTATUS", ha='center', va='center',
fontsize=14, fontweight='bold')
plt.title("Overview of Bank's Loans", fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()
Q: State wise Loan Defaults.¶
In [36]:
sld=df.groupby('address_state',as_index=False).agg(TC=('id','count'),TLD=('cov','sum'))
sld['Percent%']=round(sld['TLD']/sld['TC']*100)
In [37]:
sld_sorted = sld.sort_values(by='Percent%', ascending=False)
colors = plt.cm.Reds(
sld_sorted['Percent%'] / sld_sorted['Percent%'].max()
)
plt.figure(figsize=(12,10))
bars = plt.barh(
sld_sorted['address_state'],
sld_sorted['Percent%'],
color=colors,
height=0.6
)
plt.gca().invert_yaxis()
plt.title('Volume & Rate of Defaulted Loans per State', fontsize=12, fontweight='bold')
plt.xlabel('Defaulted Rate (%)', fontsize=12)
plt.ylabel('State', fontsize=12)
plt.xlim(0,100)
for bar, d, p in zip(bars, sld_sorted['TLD'], sld_sorted['Percent%']):
plt.text(
bar.get_width()+1,
bar.get_y() + bar.get_height()/2,
f'Defaults: {d} | {p:.0f}%',
va='center',
fontsize=9
)
plt.grid(axis='x', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
Q: Loan Defaults by total_acc¶
In [47]:
tald=df.groupby('total_acc')['cov'].sum().reset_index(name='TLD')
tald['Ratio']=round(tald['TLD']/df['cov'].sum()*100,2)
tald.query("Ratio>1")
Out[47]:
| total_acc | TLD | Ratio | |
|---|---|---|---|
| 2 | 4 | 72 | 1.35 |
| 3 | 5 | 84 | 1.58 |
| 4 | 6 | 98 | 1.84 |
| 5 | 7 | 123 | 2.31 |
| 6 | 8 | 162 | 3.04 |
| 7 | 9 | 152 | 2.85 |
| 8 | 10 | 170 | 3.19 |
| 9 | 11 | 184 | 3.45 |
| 10 | 12 | 186 | 3.49 |
| 11 | 13 | 181 | 3.39 |
| 12 | 14 | 217 | 4.07 |
| 13 | 15 | 187 | 3.51 |
| 14 | 16 | 216 | 4.05 |
| 15 | 17 | 194 | 3.64 |
| 16 | 18 | 174 | 3.26 |
| 17 | 19 | 189 | 3.54 |
| 18 | 20 | 179 | 3.36 |
| 19 | 21 | 169 | 3.17 |
| 20 | 22 | 149 | 2.79 |
| 21 | 23 | 160 | 3.00 |
| 22 | 24 | 175 | 3.28 |
| 23 | 25 | 140 | 2.63 |
| 24 | 26 | 113 | 2.12 |
| 25 | 27 | 131 | 2.46 |
| 26 | 28 | 135 | 2.53 |
| 27 | 29 | 115 | 2.16 |
| 28 | 30 | 125 | 2.34 |
| 29 | 31 | 99 | 1.86 |
| 30 | 32 | 82 | 1.54 |
| 31 | 33 | 107 | 2.01 |
| 32 | 34 | 71 | 1.33 |
| 33 | 35 | 82 | 1.54 |
| 34 | 36 | 82 | 1.54 |
| 36 | 38 | 67 | 1.26 |
| 37 | 39 | 62 | 1.16 |
In [49]:
tald['total_acc'].corr(tald['Ratio'])
Out[49]:
np.float64(-0.8117562018798052)
In [50]:
# There is no relation between total no. of accounts and loan defaults.
Q: Loan Applications by Income category¶
In [87]:
def ic(val):
if val<=50000:
return 'Lower Class'
elif val>50000 and val<=100000:
return 'Lower Middle Class'
elif val>100000 and val<=500000:
return 'Middle Class'
elif val>500000 and val<=2500000:
return 'Upper Middle Class'
else:
return 'Super Class'
df['IncCat']=df['annual_income'].apply(ic)
taic=df.groupby('IncCat')['id'].count().reset_index(name='TLA').sort_values(by='TLA',ascending=True)
plt.figure(figsize=(15,5))
bar=plt.barh(taic['IncCat'],taic['TLA'],color=plt.cm.Blues(taic['TLA']/taic['TLA'].max()))
plt.title("Number of Loan Applications by Income Category")
plt.xlabel("Number of Applications")
plt.ylabel("Income Category")
plt.xlim(0,25000)
plt.bar_label(bar,padding=5)
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
Q: Loan defaulted by Income Category¶
In [117]:
ldic=df.groupby('IncCat')['cov'].sum().astype(int).reset_index(name='TLD').sort_values(by='TLD')
ldic=pd.merge(ldic,taic,on='IncCat',how='left')
ldic['DefaultRate']=(ldic['TLD']/ldic['TLA']*100).astype(int)
ldic=ldic.sort_values(by='DefaultRate')
plt.figure(figsize=(20,6))
bars = plt.barh(
ldic['IncCat'],
ldic['DefaultRate'],
color=plt.cm.Reds(ldic['DefaultRate'] / ldic['DefaultRate'].max())
)
plt.title("Number of Loans Defaulted & Percentage by Income Category")
plt.xlabel("Default Rate (%)")
plt.ylabel("Income Category")
plt.xlim(0,20)
for bar, nld, pct in zip(bars, ldic['TLD'], ldic['DefaultRate']):
plt.text(
bar.get_width()+2.5,
bar.get_y() + bar.get_height() / 2,
f"Defaulted: {nld} | {pct:.0f}%",
va='center',
ha='right',
fontsize=10,
)
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.subplots_adjust(left=0.3, right=0.95)
plt.show()